import pyodbc
import pandas as pd
import os
import pickle
import matplotlib.pyplot as plt
import numpy as np


df = pd.read_csv('/REDACTED/data/raw/pulledData/op_audit_costs_audit_data2014-2014_unmodified.csv')
#df = df[~df[tpivarb].isna()]

df =df[~df.total_pos_inc.isna()]
wage_dict_2010_to_2013 = {1:8.53,2:9.59,3:10.46,4:11.75,5:13.14,6:14.65,7:16.28,8:18.03,9:19.92,10:21.93,11:24.10,12:28.88,13:34.34,14:40.58,15:47.74}
wage_dict_2014 = {1:8.62,2:9.69,3:10.57,4:11.87,5:13.28,6:14.8,7:16.44,8:18.21,9:20.11,10:22.15,11:24.34,12:29.17,13:34.69,14:40.99,15:48.21}
df['wage2k14'] = df['exam_gr'].map(wage_dict_2014)
df['wagepre'] = df['exam_gr'].map(wage_dict_2010_to_2013)
df['wage'] = df['wagepre']
df.loc[df.tax_yr==2014,'wage'] = df.loc[df.tax_yr==2014,'wage2k14']
df = df.drop(columns=['wagepre'])
df['tot_extm'] = np.where(df['activity_code'] == 270, 18.3, df['tot_extm'])
df['tot_extm'] = np.where(df['activity_code'] == 271, 54.8, df['tot_extm'])
df['cost'] = df['wage2k14']*df['tot_extm']


#drop wrong eitc ACs
df['nunique_acs'] = df.groupby(['taxpayer_id','tax_period','master_file_code']).activity_code.transform('nunique')
df = df[df.nunique_acs==1]
df['wrong_AC'] = ((df.eic>0) & (df.activity_code>271)) | ((df.eic==0) & (df.activity_code<272))  
df = df[~df.activity_code<270].copy() 
no_wrong = df[~df.wrong_AC].copy()

aud_cost = df.groupby(['taxpayer_id','tax_period','master_file_code','activity_code']).agg({'cost':'sum','total_pos_inc':'mean'}).reset_index()
bins=[0, 6700,12500,18600,26137,35467,47073, 62027,86181,126994,1231794067]
binlabels = [1,2,3,4,5,6,7,8,9,10]
aud_cost['income_bucket'] = pd.cut(aud_cost.total_pos_inc,bins=bins,labels=binlabels,include_lowest=True,right=False)
ac_pctile_99 = aud_cost.groupby('activity_code').cost.quantile(0.99).to_dict()
ac_pctile_01 = aud_cost.groupby('activity_code').cost.quantile(0.01).to_dict()
aud_cost['q99'] = aud_cost.activity_code.map(ac_pctile_99)
aud_cost['q01'] = aud_cost.activity_code.map(ac_pctile_01)
aud_cost['cost_wins'] =aud_cost[['cost','q99']].min(axis=1)
aud_cost['cost_wins'] = aud_cost[['cost_wins','q01']].max(axis=1)



avg_cost = aud_cost.groupby(['income_bucket','activity_code']).cost.mean()
median_cost = aud_cost.groupby(['income_bucket','activity_code']).cost.median()
avg_cost_w = aud_cost.groupby(['income_bucket','activity_code']).cost_wins.mean()
median_cost_w = aud_cost.groupby(['income_bucket','activity_code']).cost_wins.median()
avg_cost.reset_index().to_csv('/REDACTED/data/metadata/mean_cost_decile_ac_V2.csv',index=False)
median_cost.reset_index().to_csv('/REDACTED/data/metadata/median_cost_decile_ac_V2.csv',index=False)
avg_cost_w.reset_index().to_csv('/REDACTED/data/metadata/mean_cost_decile_ac_wins_V2.csv',index=False)
median_cost_w.reset_index().to_csv('/REDACTED/data/metadata/median_cost_decile_ac_wins_V2.csv',index=False)

avg_cost_ac = aud_cost.groupby(['activity_code']).cost.mean()
median_cost_ac = aud_cost.groupby(['activity_code']).cost.median()
avg_cost_w_ac = aud_cost.groupby(['activity_code']).cost_wins.mean()
median_cost_w_ac = aud_cost.groupby(['activity_code']).cost_wins.median()
avg_cost_ac.reset_index().to_csv('/REDACTED/data/metadata/mean_cost_ac_V2.csv',index=False)
median_cost_ac.reset_index().to_csv('/REDACTED/data/metadata/median_cost_ac_V2.csv',index=False)
avg_cost_w_ac.reset_index().to_csv('/REDACTED/data/metadata/mean_cost_ac_wins_V2.csv',index=False)
median_cost_w_ac.reset_index().to_csv('/REDACTED/data/metadata/median_cost_ac_wins_V2.csv',index=False)

